library(tidyverse) #importing, tidying, plotting data
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr) #making tables
library(leaflet)
library(tinytex) #may need for knitting pdf versions of .rmd file
library(ggplot2)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(ggfortify)
#Body condition factor is BCF = ((body weight(g) - gut content(g)) / length(cm)^3) *100.
library(readr)
Site_BCF <- read_csv("data/Site_BCF.csv")
## Rows: 21 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): Site
## dbl (2): Site_Location, Body_Condition_Factor
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(Site_BCF,aes(Body_Condition_Factor,Site)) +
geom_point() +
geom_smooth(method="glm", method.args=list(family="binomial"(link="logit")))+
geom_hline(yintercept = mean(Site_BCF$Site), linetype = "dashed") +
labs(title="GLM, binomial count (DF/WB)")
## Warning in mean.default(Site_BCF$Site): argument is not numeric or logical:
## returning NA
## `geom_smooth()` using formula 'y ~ x'
## Warning: Computation failed in `stat_smooth()`:
## y values must be 0 <= y <= 1
## Warning: Removed 1 rows containing missing values (geom_hline).
#Binomial showing Body Condition Factor values of individuals across two study sites: Wall Branch (WB) creek system and Dry Fork (DF), Whites creek system.
library(readr)
Site_BodyConditionFactor <- read_csv("data/Site_BodyConditionFactor.csv")
## Rows: 21 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): Site_Location, Body_Condition_Factor
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Here, the two sites are assigned as 1= Wall Branch and 0= Dry Fork #Scatter plot with geom_smooth applied
ggplot(Site_BodyConditionFactor,aes(Body_Condition_Factor,Site_Location)) +
geom_point() +
geom_smooth() +
xlab ("Body Condition Factor") +
ylab ("Site") +
labs(title="Raw Fit: 1=Wall Branch, 0=Dry Fork")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Site_BodyConditionFactor$BCF100 <- Site_BodyConditionFactor$Body_Condition_Factor/100
fit.1 <- glm(Site_Location~BCF100, data=Site_BodyConditionFactor, binomial(link="logit"))
summary(fit.1)
##
## Call:
## glm(formula = Site_Location ~ BCF100, family = binomial(link = "logit"),
## data = Site_BodyConditionFactor)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.249 -1.159 -1.008 1.212 1.371
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.967 4.650 0.423 0.672
## BCF100 -257.309 577.880 -0.445 0.656
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 29.065 on 20 degrees of freedom
## Residual deviance: 28.864 on 19 degrees of freedom
## AIC: 32.864
##
## Number of Fisher Scoring iterations: 4
autoplot(fit.1)
library(arm)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: lme4
##
## arm (Version 1.11-2, built: 2020-7-27)
## Working directory is C:/Users/ridew/OneDrive/Documents/Advanced Analytics/GLM_Assignment/GLMAssignment
x <- predict(fit.1)
y <- resid(fit.1)
binnedplot(x, y)
#The results of the binnedplot, show that binned residuals of this data do not present good binary data. The binned residual plot is used to view points that fall into +/- 2 standard errors; ~95% of the binned residuals.
coef(fit.1)
## (Intercept) BCF100
## 1.966753 -257.308577
confint(fit.1)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -7.210625 11.7298
## BCF100 -1470.488665 880.3547
ggplot(Site_BodyConditionFactor, aes(BCF100,Site_Location)) +
geom_point() +
geom_smooth(method="glm", method.args=list(family="binomial"(link="logit"))) +
xlab ("Body Condition Factor") +
ylab ("Site") +
labs (title="Raw Fit: 1=Wall Branch, 0=Dry Fork")
## `geom_smooth()` using formula 'y ~ x'
invlogit <- function(x) {1 / ( 1+exp(-x) ) }
invlogit(coef(fit.1))
## (Intercept) BCF100
## 8.772620e-01 1.787741e-112
#The body codition values were simialr across the site werealomst exactly identical, making this dataset not appropriate predictor of what population you belong to. #For the summary table, we see the binary data has a negative effect on transformed BFC100 data.
Figure 1. Length Measurements
Figure 2. Fathead Minnow Gonads (Testes)
library(readr)
GSI_RedColorationArea <- read_csv("data/GSI_RedColorationArea.csv")
## Rows: 21 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): GSI_Value, Red_Coloration_Area
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(data = GSI_RedColorationArea, aes(x=GSI_Value, y=Red_Coloration_Area)) +
geom_point()+
xlab("GSI") +
ylab("Red Coloration %") +
theme_bw()+
geom_smooth(method=lm)
## `geom_smooth()` using formula 'y ~ x'
ggsave("outputs/ScatterGSI_Area.png")
## Saving 7 x 5 in image
## `geom_smooth()` using formula 'y ~ x'
FitGSI_Area <- lm(data= GSI_RedColorationArea, Red_Coloration_Area~GSI_Value)
autoplot(FitGSI_Area)
anova(FitGSI_Area)
## Analysis of Variance Table
##
## Response: Red_Coloration_Area
## Df Sum Sq Mean Sq F value Pr(>F)
## GSI_Value 1 13.37 13.368 0.2345 0.6337
## Residuals 19 1083.04 57.002
summary(FitGSI_Area)
##
## Call:
## lm(formula = Red_Coloration_Area ~ GSI_Value, data = GSI_RedColorationArea)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.129 -4.093 0.079 4.519 13.015
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.0913 2.5593 5.506 2.6e-05 ***
## GSI_Value -0.1511 0.3120 -0.484 0.634
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.55 on 19 degrees of freedom
## Multiple R-squared: 0.01219, Adjusted R-squared: -0.0398
## F-statistic: 0.2345 on 1 and 19 DF, p-value: 0.6337
#results of gaussian shows a 0.6337 p-value
boxcox(FitGSI_Area)
# #see linear regression for FitGSI_Area
#These results suggest that a lambda value ~ 0.7 will be the best fit for this data.
#The R output for the boxcox() function plots the maximum likelihood surface (the curve) together with a maximum likelihood-based 95% CI (Hector, 2015) #These results suggest that a lambda value ~ 0.7 will be the best fit for this data.
#Helpful GLM component info from Hector, 2015 Ch8
#GLMs have three components: # 1.) a linear predictor- is what comes after the tilde (~) in our linear model formula # 2.) a variance function - models the variation in the data;make use of a much wider rangefamily of distributions including the poisson, the binomial, and the gamma. #3.) a link function- plays a role equivalent to the transformation in normal least squares models. However, rather than transforming the data we transform the predictions made by the linear predictor. Commonly used link functions include the log, square root, and logistic.
GLMGSI_Area <- glm(Red_Coloration_Area~GSI_Value, data= GSI_RedColorationArea, family= gaussian(link=))
ggplot(GSI_RedColorationArea,aes(GSI_Value,Red_Coloration_Area)) +
geom_point() +
geom_smooth(method="glm",colour="red",
method.args=list(family="gaussian"(link=)))+
labs(title="GLM, Gaussian")
## `geom_smooth()` using formula 'y ~ x'
autoplot(GLMGSI_Area)
anova(GLMGSI_Area)
## Analysis of Deviance Table
##
## Model: gaussian, link: identity
##
## Response: Red_Coloration_Area
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 20 1096.4
## GSI_Value 1 13.368 19 1083.0
summary(GLMGSI_Area)
##
## Call:
## glm(formula = Red_Coloration_Area ~ GSI_Value, family = gaussian(link = ),
## data = GSI_RedColorationArea)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -11.129 -4.093 0.079 4.519 13.015
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.0913 2.5593 5.506 2.6e-05 ***
## GSI_Value -0.1511 0.3120 -0.484 0.634
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 57.00198)
##
## Null deviance: 1096.4 on 20 degrees of freedom
## Residual deviance: 1083.0 on 19 degrees of freedom
## AIC: 148.4
##
## Number of Fisher Scoring iterations: 2
gammalog <- glm(Red_Coloration_Area~GSI_Value, data= GSI_RedColorationArea, family=Gamma(link= "log"))
ggplot(GSI_RedColorationArea,aes(GSI_Value,Red_Coloration_Area)) +
geom_point() +
geom_smooth(method="glm",colour="red",
method.args=list(family="Gamma"(link="log")))+
labs(title="GLM, log link, Gamma variance")
## `geom_smooth()` using formula 'y ~ x'
anova(gammalog)
## Analysis of Deviance Table
##
## Model: Gamma, link: log
##
## Response: Red_Coloration_Area
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev
## NULL 20 9.2373
## GSI_Value 1 0.08659 19 9.1508
summary(gammalog)
##
## Call:
## glm(formula = Red_Coloration_Area ~ GSI_Value, family = Gamma(link = "log"),
## data = GSI_RedColorationArea)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.33051 -0.33911 0.00496 0.29534 0.75609
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.65341 0.19319 13.735 2.57e-11 ***
## GSI_Value -0.01268 0.02355 -0.538 0.597
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.324797)
##
## Null deviance: 9.2373 on 20 degrees of freedom
## Residual deviance: 9.1508 on 19 degrees of freedom
## AIC: 148.62
##
## Number of Fisher Scoring iterations: 5
coef(gammalog)
## (Intercept) GSI_Value
## 2.65341089 -0.01268145
confint(gammalog)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 2.2800283 3.05155112
## GSI_Value -0.0578871 0.03754406
leaflet() %>%
setView(-86.854396, 36.26361 , zoom = 16) %>% #lat-long of the place of interest
addTiles() %>%
addProviderTiles('Esri.WorldImagery') %>%
addMarkers(-86.854396, 36.26361 , popup = "Dry Fork, Whites Creek System")
leaflet() %>%
setView(-87.287965, 36.499277 , zoom = 16) %>% #lat-long of the place of interest
addTiles() %>%
addProviderTiles('Esri.WorldImagery') %>%
addMarkers(-87.287965, 36.499277 , popup = "Rotary Park, Wall Branch Creek System")
Figure 3. Southern Redbelly Dace in Full Breeding Coloration (Chrosomus erythrogaster)